# coding: utf-8
import sys  # NOTE(review): the former `sys.getdefaultencoding()` call was a Python-2-era
# leftover whose return value was discarded — it had no effect and was removed.
import scrapy
import re
import logging
'''
爬虫：河南公共资源网
创建爬虫：scrapy genspider hnggzy "www.hnggzy.com"
'''
class HnggzySpider(scrapy.Spider):
    """Spider for the Henan public-resource trading site (河南公共资源网).

    Crawls up to the 10 most recent pages of the tender listing at
    /hnsggzy/jyxx/002001/002001001/ and yields one dict per table row
    with keys: title, title_url, title_date, come_from.

    Created with: scrapy genspider hnggzy "www.hnggzy.com"
    """
    name = 'hnggzy'
    count = 0  # class-level counter of pages crawled, shared across parse() calls
    allowed_domains = ['www.hnggzy.com']
    start_urls = ['http://www.hnggzy.com/hnsggzy/jyxx/002001/002001001/']

    def parse(self, response):
        """Extract every listing row from the page, then follow the next page.

        Yields item dicts, then (while fewer than 10 pages have been seen)
        a scrapy.Request for the next page back into this same callback.
        """
        for tr in response.xpath("//table[@class='divlxyz']/tr"):
            item = {
                "title": tr.xpath("./td[2]/a/text()").extract_first(),
                "title_url": response.urljoin(
                    tr.xpath("./td[2]/a/@href").extract_first()),
                "come_from": '河南公共资源网',
            }
            # Publication date cell, e.g. "[2020-01-01]" -> "2020-01-01".
            # Guard: extract_first() returns None when the cell is missing;
            # the original re.sub(pattern, '', None) raised TypeError.
            pub_date = tr.xpath("./td[3]/font/text()").extract_first()
            item['title_date'] = (
                re.sub(r'[\[\]]', '', pub_date) if pub_date else pub_date
            )
            logging.warning(item)
            yield item

        # The "next page" table cell carries an onclick handler whose tail
        # encodes the page number; keep only digits from the last 6 chars.
        onclick = response.xpath(
            "//div[@class='pagemargin']//table/tr/td[15]/@onclick"
        ).extract_first()
        if onclick is None:
            # No pagination control on this page (e.g. last page or layout
            # change) — stop cleanly instead of crashing on None[-6:].
            return
        page_no = re.sub(r'\D', '', onclick[-6:])
        next_link_url = (
            "http://www.hnggzy.com/hnsggzy/jyxx/002001/002001001/?Paging="
            + page_no
        )
        HnggzySpider.count += 1
        if HnggzySpider.count < 10:  # crawl only the 10 most recent pages
            logging.warning("接下来爬取页数是：" + page_no + "    链接是：" + next_link_url)
            yield scrapy.Request(
                next_link_url,
                callback=self.parse,
                dont_filter=True,
            )